---
title: "tidymodels 1 recipes in R on hotel booking"
subtitle: "prediction for binary model"
author: "Tony Duan"
date: "2023-10-12"
categories: [R]
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: left
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
# package
```{r}
library(tidyverse)
library(ggplot2)   # already attached by tidyverse
library(tidymodels)
library(rsample)   # already attached by tidymodels
library(themis)    # extra recipe steps for class imbalance, e.g. step_downsample()
```
# data
```{r}
library(tidyverse)
hotels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv")
hotel_stays <- hotels %>%
  filter(is_canceled == 0) %>% # keep only bookings that were not canceled
  mutate(
    # two-level outcome: did the stay include children or babies?
    children = case_when(
      children + babies > 0 ~ "children",
      TRUE ~ "none"
    ),
    required_car_parking_spaces = case_when(
      required_car_parking_spaces > 0 ~ "parking",
      TRUE ~ "none"
    )
  ) %>%
  select(-is_canceled, -reservation_status, -babies)
hotel_stays
```
## EDA
```{r}
hotel_stays %>%
count(children)
```
```{r}
library(skimr)
skim(hotel_stays)
```
```{r}
hotel_stays %>%
mutate(arrival_date_month = factor(arrival_date_month,
levels = month.name
)) %>%
count(hotel, arrival_date_month, children) %>%
group_by(hotel, children) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(arrival_date_month, proportion, fill = children)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~hotel, nrow = 2) +
labs(
x = NULL,
y = "Proportion of hotel stays",
fill = NULL
)
```
## data split
```{r}
hotels_df <- hotel_stays %>%
select(
children, hotel, arrival_date_month, meal, adr, adults,
required_car_parking_spaces, total_of_special_requests,
stays_in_week_nights, stays_in_weekend_nights
) %>%
mutate_if(is.character, factor)
```
```{r}
library(tidymodels)
set.seed(1234)
hotel_split <- initial_split(hotels_df)
hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
```
# model
## recipe
```{r}
hotel_rec <- recipe(children ~ ., data = hotel_train) %>%
  step_downsample(children) %>% # balance the outcome classes (skipped when baking new data)
  step_dummy(all_nominal(), -all_outcomes()) %>% # dummy-encode nominal predictors
  step_zv(all_numeric()) %>% # drop zero-variance columns
  step_normalize(all_numeric()) # centre and scale numeric columns
```
The difference between recipe(), prep(), bake(), and juice(): recipe() declares the preprocessing steps, prep() estimates them from the training data, bake() applies the trained recipe to a data set, and juice() returns the already-processed training data.
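A minimal sketch of how the four functions fit together, using the built-in iris data rather than the hotel booking data:

```{r}
# toy sketch of the recipes workflow (iris data, not the hotel data)
toy_rec <- recipe(Species ~ ., data = iris) %>% # declare roles and steps
  step_normalize(all_numeric_predictors())

toy_prep <- prep(toy_rec, training = iris) # estimate step parameters from the training data

baked <- bake(toy_prep, new_data = iris) # apply the trained steps to any data set
juiced <- juice(toy_prep) # shortcut for the already-processed training data
```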
## prep the recipe
```{r}
hotel_rec <- hotel_rec %>% prep()
```
```{r}
hotel_rec
```
## bake the train data with the prepped recipe
```{r}
train_proc <- bake(hotel_rec, new_data = hotel_train)
```
```{r}
train_proc2 <- bake(hotel_rec, new_data = NULL)
```
```{r}
train_juice <- juice(hotel_rec)
```
The difference between train_proc and train_juice is that train_juice has been downsampled: step_downsample() uses skip = TRUE by default, so the step is applied to the training data inside prep()/juice() but skipped when bake() is given new data.
```{r}
dim(train_proc)
```
```{r}
dim(train_proc2)
```
```{r}
dim(train_juice)
```
```{r}
train_proc %>%
count(children)
```
In train_proc2 and train_juice the outcome has been downsampled to a 1:1 ratio:
```{r}
train_proc2 %>%
count(children)
```
```{r}
train_juice %>%
count(children)
```
## bake the test data with the prepped recipe
```{r}
test_proc <- bake(hotel_rec, new_data = hotel_test)
```
juice(hotel_rec) is equivalent to bake(hotel_rec, new_data = NULL); both match bake(hotel_rec, new_data = hotel_train) except that the downsampling step is skipped when baking new data.
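A quick check of that claim (all.equal() is used here because identical() can be overly strict about tibble attributes):

```{r}
# juice() and bake(new_data = NULL) return the same processed, downsampled training data
all.equal(juice(hotel_rec), bake(hotel_rec, new_data = NULL))

# baking the raw training data skips step_downsample(), so the row count is larger
nrow(bake(hotel_rec, new_data = hotel_train))
nrow(juice(hotel_rec))
```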
## model specification
### KNN model
```{r}
knn_spec <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("classification")
```
### tree model
```{r}
tree_spec <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
```
## workflow
We do not use a workflow() in this post; the recipe is prepped and baked by hand instead. A sketch of what the workflow() alternative would look like is shown below.
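This is only a sketch (not run): a workflow() bundles an unprepped copy of the recipe with the model specification, so prepping, downsampling and baking happen automatically inside fit().

```{r}
#| eval: false
# sketch only: bundle an *unprepped* recipe with the model spec in a workflow()
knn_wf <- workflow() %>%
  add_recipe(
    recipe(children ~ ., data = hotel_train) %>%
      step_downsample(children) %>%
      step_dummy(all_nominal(), -all_outcomes()) %>%
      step_zv(all_numeric()) %>%
      step_normalize(all_numeric())
  ) %>%
  add_model(knn_spec)

knn_wf_fit <- fit(knn_wf, data = hotel_train)
```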
## training
Train the two models on the juiced (downsampled, processed) training data.
```{r}
knn_fit <- knn_spec %>%
fit(children ~ ., data = train_juice)
knn_fit
```
```{r}
tree_fit <- tree_spec %>%
fit(children ~ ., data = train_juice)
tree_fit
```
## Evaluate
We can build a set of Monte Carlo splits from the downsampled training data and use this set of resamples to estimate the performance of our two models using the fit_resamples() function. This function does not do any tuning of the model parameters; in fact, it does not even keep the models it trains. This function is used for computing performance metrics across some set of resamples like our validation splits. It will fit a model such as knn_spec to each resample and evaluate on the heldout bit from each resample, and then we can collect_metrics() from the result.
```{r}
# mc_cv() creates 25 Monte Carlo resamples by default (change this with times =)
set.seed(1234)
validation_splits <- mc_cv(train_juice, prop = 0.9, strata = children)
validation_splits
```
Note that fit_resamples() takes the untrained model specification (e.g. knn_spec), not an already-fitted model.
```{r}
knn_res <- knn_spec %>% fit_resamples(
children ~ .,
validation_splits,
control = control_resamples(save_pred = TRUE)
)
```
```{r}
knn_res %>%
collect_metrics()
```
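Because save_pred = TRUE was used, the saved out-of-sample predictions can also be flattened with collect_predictions() instead of unnesting .predictions by hand; the pooled ROC AUC below will be close to, but not identical to, the resample mean reported by collect_metrics():

```{r}
# pool the saved out-of-sample predictions across the 25 resamples
knn_res %>%
  collect_predictions() %>%
  roc_auc(children, .pred_children)
```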
```{r}
tree_res <- tree_spec %>% fit_resamples(
children ~ .,
validation_splits,
control = control_resamples(save_pred = TRUE)
)
```
```{r}
tree_res %>%
collect_metrics()
```
```{r}
knn_res %>%
unnest(.predictions) %>%
mutate(model = "kknn") %>%
bind_rows(tree_res %>%
unnest(.predictions) %>%
mutate(model = "rpart")) %>%
group_by(model) %>%
roc_curve(children, .pred_children) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
geom_line(linewidth = 1.5) +
geom_abline(
lty = 2, alpha = 0.5,
color = "gray50",
linewidth = 1.2
)
```
```{r}
knn_conf <- knn_res %>%
unnest(.predictions) %>%
conf_mat(children, .pred_class)
knn_conf
```
```{r}
knn_conf %>%
autoplot()
```
```{r}
#knn_fit %>%
# predict(new_data = test_proc, type = "prob") %>%
# mutate(truth = hotel_test$children) %>%
# roc_auc(truth, .pred_children)
```
# predictions
```{r}
predictions <- knn_fit %>%
predict(new_data = test_proc, type = "prob") %>%
mutate(truth = hotel_test$children)
head(predictions)
```
```{r}
predictions %>% roc_auc(truth, .pred_children)
```
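As a small follow-up sketch, the same fitted model can also produce hard class predictions on the test set, summarised with a confusion matrix:

```{r}
# class predictions on the held-out test set, summarised as a confusion matrix
knn_fit %>%
  predict(new_data = test_proc, type = "class") %>%
  mutate(truth = hotel_test$children) %>%
  conf_mat(truth, .pred_class)
```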
# Reference
- https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-11/readme.md
- https://juliasilge.com/blog/hotels-recipes/
- https://www.tidymodels.org/start/case-study/